Introduction

This analysis examines simulated income data for Hungary, focusing on the relationships between income and various demographic factors including age, location, occupation, and gender. The dataset is simulated to reflect realistic patterns while maintaining a manageable size for analysis.

Data Simulation

set.seed(123) # For reproducibility

# Define parameters
n <- 10000  # Number of observations
cities <- c("Budapest", "Debrecen", "Szeged", "Miskolc", "Pécs", "Győr", "Szombathely", "Eger")
occupations <- c("Software Developer", "Teacher", "Doctor", "Sales Representative", 
                "Engineer", "Accountant", "Nurse", "Manager", "Chef", "Driver")
genders <- c("Male", "Female")

# Generate data
data <- data.frame(
  age = round(rnorm(n, mean = 40, sd = 12)),
  city = sample(cities, n, replace = TRUE),
  occupation = sample(occupations, n, replace = TRUE),
  gender = sample(genders, n, replace = TRUE)
)

# Ensure age is between 18 and 70
data$age <- pmax(18, pmin(70, data$age))

# Generate income based on relationships
data <- data %>%
  mutate(
    # Base income
    base_income = 300000,
    
    # Age effect (quadratic relationship)
    age_effect = 10000 * (age - 40) - 100 * (age - 40)^2,
    
    # City effect
    city_effect = case_when(
      city == "Budapest" ~ 100000,
      city %in% c("Debrecen", "Szeged") ~ 50000,
      TRUE ~ 0
    ),
    
    # Occupation effect
    occupation_effect = case_when(
      occupation == "Software Developer" ~ 150000,
      occupation == "Doctor" ~ 120000,
      occupation == "Manager" ~ 100000,
      occupation == "Engineer" ~ 80000,
      occupation == "Accountant" ~ 60000,
      occupation == "Teacher" ~ 40000,
      occupation == "Nurse" ~ 30000,
      occupation == "Chef" ~ 20000,
      occupation == "Driver" ~ 10000,
      TRUE ~ 0
    ),
    
    # Gender effect (simulating gender pay gap)
    gender_effect = ifelse(gender == "Male", 20000, 0),
    
    # Random noise
    noise = rnorm(n, 0, 50000)
  ) %>%
  mutate(
    income = base_income + age_effect + city_effect + occupation_effect + gender_effect + noise
  ) %>%
  select(age, city, occupation, gender, income)

# Save the data
write.csv(data, "hungarian_income_data.csv", row.names = FALSE)

Descriptive Statistics

# Summary statistics with kable
summary_stats <- summary(data)
kable(summary_stats, caption = "Summary Statistics of the Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Summary Statistics of the Dataset
age city occupation gender income
Min. :18.0 Length:10000 Length:10000 Length:10000 Min. :-112471
1st Qu.:32.0 Class :character Class :character Class :character 1st Qu.: 289781
Median :40.0 Mode :character Mode :character Mode :character Median : 389012
Mean :40.1 NA NA NA Mean : 382371
3rd Qu.:48.0 NA NA NA 3rd Qu.: 479762
Max. :70.0 NA NA NA Max. : 843293
# Interactive income distribution by gender
p1 <- ggplot(data, aes(x = income, fill = gender)) +
  geom_density(alpha = 0.6) +
  scale_fill_viridis_d() +
  labs(title = "Income Distribution by Gender",
       subtitle = "Density plot showing the distribution of income across genders",
       x = "Income (HUF)",
       y = "Density") +
  custom_theme

ggplotly(p1)
# Interactive income by occupation with jitter
p2 <- ggplot(data, aes(x = reorder(occupation, income, FUN = median), y = income, color = occupation)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(alpha = 0.1, width = 0.2) +
  scale_color_viridis_d() +
  coord_flip() +
  labs(title = "Income Distribution by Occupation",
       subtitle = "Boxplot with individual data points",
       x = "Occupation",
       y = "Income (HUF)") +
  custom_theme

ggplotly(p2)
# Interactive income by city with violin plot
p3 <- ggplot(data, aes(x = reorder(city, income, FUN = median), y = income, fill = city)) +
  geom_violin(alpha = 0.7) +
  geom_boxplot(width = 0.2, alpha = 0.5) +
  scale_fill_viridis_d() +
  coord_flip() +
  labs(title = "Income Distribution by City",
       subtitle = "Violin plot with boxplot overlay",
       x = "City",
       y = "Income (HUF)") +
  custom_theme

ggplotly(p3)
# Interactive age vs income scatter plot with regression line
p4 <- ggplot(data, aes(x = age, y = income, color = gender)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = TRUE) +
  scale_color_viridis_d() +
  labs(title = "Relationship between Age and Income",
       subtitle = "Scatter plot with LOESS regression line",
       x = "Age",
       y = "Income (HUF)") +
  custom_theme

ggplotly(p4)
# Create a correlation heatmap
numeric_data <- data %>% select(age, income)
cor_matrix <- cor(numeric_data)
cor_df <- as.data.frame(cor_matrix)
cor_df$var1 <- rownames(cor_df)
cor_df_long <- pivot_longer(cor_df, -var1, names_to = "var2", values_to = "correlation")

ggplot(cor_df_long, aes(x = var1, y = var2, fill = correlation)) +
  geom_tile() +
  scale_fill_viridis() +
  labs(title = "Correlation Heatmap",
       subtitle = "Correlation between numeric variables",
       x = "",
       y = "") +
  custom_theme +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Hypothesis Testing

Parametric Tests

1. Gender Income Difference (t-test)

# Test if there's a significant difference in income between genders
t_test_result <- t.test(income ~ gender, data = data)
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  income by gender
## t = -5.9705, df = 9996.2, p-value = 2.446e-09
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -21941.72 -11095.24
## sample estimates:
## mean in group Female   mean in group Male 
##             374135.3             390653.8

2. City Income Differences (ANOVA)

# Test if there are significant differences in income between cities
city_anova <- aov(income ~ city, data = data)
print(summary(city_anova))
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## city           7 1.296e+13 1.851e+12   103.3 <2e-16 ***
## Residuals   9992 1.790e+14 1.792e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3. Occupation Income Differences (ANOVA)

# Test if there are significant differences in income between occupations
occupation_anova <- aov(income ~ occupation, data = data)
print(summary(occupation_anova))
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## occupation     9 2.342e+13 2.603e+12   154.2 <2e-16 ***
## Residuals   9990 1.686e+14 1.687e+10                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Non-parametric Tests

1. Gender Income Distribution (Kolmogorov-Smirnov Test)

# Test if income distributions differ between genders
male_income <- data$income[data$gender == "Male"]
female_income <- data$income[data$gender == "Female"]
ks_test <- ks.test(male_income, female_income)
print(ks_test)
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  male_income and female_income
## D = 0.048592, p-value = 1.492e-05
## alternative hypothesis: two-sided

2. Income Distribution by City (Kruskal-Wallis Test)

# Test if income distributions differ between cities
kruskal_test <- kruskal.test(income ~ city, data = data)
print(kruskal_test)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  income by city
## Kruskal-Wallis chi-squared = 638.84, df = 7, p-value < 2.2e-16

Regression Analysis

Multiple Linear Regression

# Convert categorical variables to factors
data$city <- as.factor(data$city)
data$occupation <- as.factor(data$occupation)
data$gender <- as.factor(data$gender)

# Fit the model
model1 <- lm(income ~ age + I(age^2) + city + occupation + gender, data = data)

# Create a beautiful model summary table
model_summary <- summary(model1)
kable(tidy(model_summary), caption = "Multiple Linear Regression Results") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Multiple Linear Regression Results
term estimate std.error statistic p.value
(Intercept) -96092.80004 5346.84928 -17.971855 0
age 17807.38096 247.50834 71.946590 0
I(age^2) -98.24611 2.98482 -32.915255 0
cityDebrecen -48896.22562 2037.36766 -23.999706 0
cityEger -100281.00819 2029.78290 -49.404795 0
cityGyőr -100731.39605 2020.96342 -49.843255 0
cityMiskolc -100072.32151 2013.43652 -49.702248 0
cityPécs -99161.76998 2015.48565 -49.199939 0
citySzeged -51320.31355 2009.39904 -25.540131 0
citySzombathely -100440.01966 2014.50213 -49.858483 0
occupationChef -38632.37074 2263.24122 -17.069489 0
occupationDoctor 61046.06725 2267.92997 26.917086 0
occupationDriver -48852.44645 2239.32545 -21.815697 0
occupationEngineer 21662.39493 2249.52488 9.629765 0
occupationManager 43139.45933 2253.96387 19.139375 0
occupationNurse -28367.40970 2252.92890 -12.591347 0
occupationSales Representative -62327.70195 2247.52268 -27.731734 0
occupationSoftware Developer 91320.22019 2267.71522 40.269704 0
occupationTeacher -16321.55449 2226.30713 -7.331223 0
genderMale 19165.10817 997.20292 19.218865 0
# Enhanced model diagnostics
par(mfrow = c(2,2))
plot(model1, col = income_palette[1], pch = 19, cex = 0.7)

Polynomial Regression for Age-Income Relationship

# Fit polynomial regression
model2 <- lm(income ~ poly(age, 3), data = data)

# Create an interactive plot
p5 <- ggplot(data, aes(x = age, y = income)) +
  geom_point(alpha = 0.1, color = income_palette[1]) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), 
              color = income_palette[5], fill = income_palette[5], alpha = 0.2) +
  labs(title = "Polynomial Regression: Age vs Income",
       subtitle = "Cubic polynomial fit with confidence interval",
       x = "Age",
       y = "Income (HUF)") +
  custom_theme

ggplotly(p5)
# Model comparison
model_comparison <- data.frame(
  Model = c("Multiple Linear", "Polynomial"),
  R_squared = c(summary(model1)$r.squared, summary(model2)$r.squared),
  Adj_R_squared = c(summary(model1)$adj.r.squared, summary(model2)$adj.r.squared)
)

kable(model_comparison, caption = "Model Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Model Comparison
Model R_squared Adj_R_squared
Multiple Linear 0.8710142 0.8707686
Polynomial 0.6818015 0.6817060

Predictions

# Create a sample prediction
new_data <- data.frame(
  age = 35,
  city = "Budapest",
  occupation = "Software Developer",
  gender = "Male"
)

# Predict income
prediction <- predict(model1, newdata = new_data, interval = "prediction")
print(prediction)
##        fit      lwr      upr
## 1 517299.4 419559.1 615039.7

Summary and Conclusions

The analysis of the simulated Hungarian income data reveals several interesting patterns:

  1. There is a significant gender pay gap, with males earning more on average than females.
  2. Income varies significantly across different cities, with Budapest showing the highest average income.
  3. Occupation has a strong effect on income, with software developers and doctors earning the most.
  4. Age shows a non-linear relationship with income, peaking in the middle age range.
  5. The regression models explain a significant portion of the income variation.

The analysis demonstrates the complex interplay between various factors affecting income levels in Hungary. The findings suggest that both demographic factors (age, gender) and professional factors (occupation, location) play important roles in determining income levels.

References

  • R Core Team (2023). R: A language and environment for statistical computing.
  • Wickham, H. (2016). ggplot2: Elegant Graphics for Data Analysis.
  • Fox, J. (2016). Applied Regression Analysis and Generalized Linear Models.